package spark.code.study.rdd.operator

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

/**
  * Created by peibin on 2017/6/2.
  */
/**
  * Demonstrates that `reduceByKey` is shorthand for a `combineByKey` whose
  * combiner-creation is the identity and whose merge functions both sum:
  * both pipelines below produce the same per-key sums.
  *
  * Usage: first CLI arg (optional) sets the number of partitions (default 20).
  *
  * Created by peibin on 2017/6/2.
  */
object ReduceByKey {

  def main(args: Array[String]): Unit = {

    val spark = SparkSession
      .builder
      .appName("ReduceByKey") // label the job after this object (was a copy-paste "prepare sql")
      .master("local[*]")
      .getOrCreate()

    import spark.implicits._

    val slices = if (args.length > 0) args(0).toInt else 20
    // Compute in Long and cap at Int.MaxValue to avoid Int overflow for large slice counts.
    val n = math.min(100000L * slices, Int.MaxValue).toInt

    try {
      // Key each value by i / 10, so every key groups 10 consecutive ints.
      val input: RDD[(Int, Int)] = spark.sparkContext.parallelize(1 until n, slices).map { i =>
        (i / 10, i)
      }

      // Hand-rolled per-key sum: identity createCombiner, summing
      // mergeValue and mergeCombiners.
      input.combineByKey(
        (v: Int) => v,
        (acc: Int, v: Int) => acc + v,
        (acc1: Int, acc2: Int) => acc1 + acc2).toDF().show(10)

      // The same aggregation expressed with reduceByKey.
      input.reduceByKey(_ + _).toDF().show(10)
    } finally {
      // Always release the local Spark context, even if a job above fails.
      spark.stop()
    }
  }
}
